{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = pd.read_csv('./weatherAUS.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.drop(labels = ['Date','Location','Evaporation','Sunshine','Cloud3pm','Cloud9am','RISK_MM'],axis = 1,inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "    dataset['RainToday'].replace({'No':0,'Yes':1},inplace = True)\n",
    "    dataset['RainTomorrow'].replace({'No':0,'Yes':1},inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.dropna(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(112925, 17)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(112925, 58)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "\n",
    "X = dataset.drop(labels=['RainTomorrow'], axis=1)\n",
    "Y = dataset['RainTomorrow']\n",
    "\n",
    "# ohe = OneHotEncoder()\n",
    "# result = ohe.fit_transform(X)\n",
    "# result.shape\n",
    "\n",
    "\n",
    "categorical = ['WindGustDir','WindDir9am','WindDir3pm']\n",
    "X = pd.get_dummies(X, columns=categorical, drop_first=True)\n",
    "X.dropna().shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.11756741, -0.10822071, -0.20666127, ..., -0.27515781,\n",
       "         3.90999267, -0.27186642],\n",
       "       [-0.84180219,  0.20684494, -0.27640495, ..., -0.27515781,\n",
       "        -0.25575496,  3.67827705],\n",
       "       [ 0.03761995,  0.29277194, -0.27640495, ..., -0.27515781,\n",
       "        -0.25575496,  3.67827705],\n",
       "       ...,\n",
       "       [-1.44940294,  0.23548728, -0.27640495, ..., -0.27515781,\n",
       "        -0.25575496, -0.27186642],\n",
       "       [-1.16159206,  0.46462594, -0.27640495, ..., -0.27515781,\n",
       "         3.90999267, -0.27186642],\n",
       "       [-0.77784422,  0.4789471 , -0.27640495, ..., -0.27515781,\n",
       "        -0.25575496, -0.27186642]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "scaler = StandardScaler()\n",
    "X = scaler.fit_transform(X)\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "\n",
    "\n",
    "y = dataset['RainTomorrow']\n",
    "\n",
    "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
    "rfc = RandomForestClassifier(n_estimators=200, max_leaf_nodes=1000)\n",
    "# rfc_s = cross_val_score(rfc, x_train, y_train, cv=10, n_jobs=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 0 ns\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "\n",
    "# 网格搜索\n",
    "# parameters = {\n",
    "#     'max_features':[14, 15, 16, 17],\n",
    "#     'max_leaf_nodes':np.arange(800, 1200, 50),\n",
    "#     'max_depth': np.arange(10, 20),\n",
    "# }\n",
    "\n",
    "# rfc_gs = GridSearchCV(rfc, parameters, cv=5, n_jobs=-1)\n",
    "\n",
    "# temp = rfc_gs.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rfc_s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from sklearn.model_selection import cross_validate\n",
    "\n",
    "# rfc_validation = cross_validate(rfc, x_train, y_train, cv=5, n_jobs=-1, return_estimator=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# good_estimator = rfc_validation['estimator'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# good_estimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sorted(good_estimator.feature_importances_, reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "\n",
    "path = r'C:\\Users\\Liang\\Desktop\\fsdownload\\train_model.pkl'\n",
    "\n",
    "GS = joblib.load(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8884215187071065"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "GS.score(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'cv': 5,\n",
       " 'error_score': 'raise-deprecating',\n",
       " 'estimator__bootstrap': True,\n",
       " 'estimator__class_weight': None,\n",
       " 'estimator__criterion': 'gini',\n",
       " 'estimator__max_depth': None,\n",
       " 'estimator__max_features': 'auto',\n",
       " 'estimator__max_leaf_nodes': 1000,\n",
       " 'estimator__min_impurity_decrease': 0.0,\n",
       " 'estimator__min_impurity_split': None,\n",
       " 'estimator__min_samples_leaf': 1,\n",
       " 'estimator__min_samples_split': 2,\n",
       " 'estimator__min_weight_fraction_leaf': 0.0,\n",
       " 'estimator__n_estimators': 200,\n",
       " 'estimator__n_jobs': None,\n",
       " 'estimator__oob_score': False,\n",
       " 'estimator__random_state': None,\n",
       " 'estimator__verbose': 0,\n",
       " 'estimator__warm_start': False,\n",
       " 'estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "                        max_depth=None, max_features='auto', max_leaf_nodes=1000,\n",
       "                        min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                        min_samples_leaf=1, min_samples_split=2,\n",
       "                        min_weight_fraction_leaf=0.0, n_estimators=200,\n",
       "                        n_jobs=None, oob_score=False, random_state=None,\n",
       "                        verbose=0, warm_start=False),\n",
       " 'iid': 'warn',\n",
       " 'n_jobs': -1,\n",
       " 'param_grid': {'max_features': [14, 15, 16, 17],\n",
       "  'max_leaf_nodes': array([ 800,  850,  900,  950, 1000, 1050, 1100, 1150], dtype=int64),\n",
       "  'max_depth': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19], dtype=int64)},\n",
       " 'pre_dispatch': '2*n_jobs',\n",
       " 'refit': True,\n",
       " 'return_train_score': False,\n",
       " 'scoring': None,\n",
       " 'verbose': 0}"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "GS.get_params()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "GS.predict_proba()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
